{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "7e618244-ed1c-49be-8e2c-3fe190fb397d",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from collections import OrderedDict\n",
    "import datetime as dt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "f1ef4c76-0310-4a31-a9d8-eca43cd213fc",
   "metadata": {},
   "outputs": [],
   "source": [
    "ercot_files1 = [\n",
    "    '2004_ERCOT_Hourly_Load_Data.csv',\n",
    "    '2005_ERCOT_Hourly_Load_Data.csv',\n",
    "    '2006_ERCOT_Hourly_Load_Data.csv',\n",
    "    '2007_ERCOT_Hourly_Load_Data.csv',\n",
    "    '2008_ERCOT_Hourly_Load_Data.csv',\n",
    "    '2009_ERCOT_Hourly_Load_Data.csv',\n",
    "    '2010_ERCOT_Hourly_Load_Data.csv',\n",
    "    '2011_ERCOT_Hourly_Load_Data.csv',\n",
    "    '2012_ERCOT_Hourly_Load_Data.csv',\n",
    "    '2013_ERCOT_Hourly_Load_Data.csv',\n",
    "    '2014_ERCOT_Hourly_Load_Data.csv',\n",
    "]\n",
    "\n",
    "ercot_files2 = [\n",
    "    \"native_Load_2015.csv\",\n",
    "    \"native_Load_2016.csv\",\n",
    "    \"native_Load_2017.csv\",\n",
    "    \"Native_Load_2018.csv\",\n",
    "    \"Native_Load_2019.csv\",\n",
    "    \"Native_Load_2020.csv\",\n",
    "    \"Native_Load_2021.csv\",\n",
    "]\n",
    "col_names = ['ds', 'COAST', 'EAST', 'FAR_WEST', 'NORTH', 'NORTH_C', 'SOUTHERN', 'SOUTH_C', 'WEST', 'y']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "1d2e3958-fc76-4307-9331-efcda5a62e22",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_list = OrderedDict()\n",
    "data_dir = \"../../data/ercot/\" \n",
    "for name in ercot_files1 + ercot_files2:\n",
    "    df_list[name] = pd.read_csv(data_dir + name, dtype=str)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "c2c61eb6-9e58-406c-8b84-07fbd925c8fd",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2004_ERCOT_Hourly_Load_Data.csv 8784\n",
      "2005_ERCOT_Hourly_Load_Data.csv 8760\n",
      "2006_ERCOT_Hourly_Load_Data.csv 8760\n",
      "2007_ERCOT_Hourly_Load_Data.csv 8760\n",
      "2008_ERCOT_Hourly_Load_Data.csv 8784\n",
      "2009_ERCOT_Hourly_Load_Data.csv 8760\n",
      "2010_ERCOT_Hourly_Load_Data.csv 8760\n",
      "2011_ERCOT_Hourly_Load_Data.csv 8760\n",
      "2012_ERCOT_Hourly_Load_Data.csv 8784\n",
      "2013_ERCOT_Hourly_Load_Data.csv 8760\n",
      "2014_ERCOT_Hourly_Load_Data.csv 8760\n",
      "native_Load_2015.csv 8760\n",
      "native_Load_2016.csv 8784\n",
      "native_Load_2017.csv 8760\n",
      "Native_Load_2018.csv 8760\n",
      "Native_Load_2019.csv 8760\n",
      "Native_Load_2020.csv 8784\n",
      "Native_Load_2021.csv 5831\n"
     ]
    }
   ],
   "source": [
    "for n,x in df_list.items():\n",
    "    print(n, len(x))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "cbaf3c7a-607a-4119-9827-0b8d08145e53",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2004_ERCOT_Hourly_Load_Data.csv ['Hour_End', 'COAST', 'EAST', 'FAR_WEST', 'NORTH', 'NORTH_C', 'SOUTHERN', 'SOUTH_C', 'WEST', 'ERCOT']\n",
      "2005_ERCOT_Hourly_Load_Data.csv ['Hour_End', 'COAST', 'EAST', 'FAR_WEST', 'NORTH', 'NORTH_C', 'SOUTHERN', 'SOUTH_C', 'WEST', 'ERCOT']\n",
      "2006_ERCOT_Hourly_Load_Data.csv ['Hour_End', 'COAST', 'EAST', 'FAR_WEST', 'NORTH', 'NORTH_C', 'SOUTHERN', 'SOUTH_C', 'WEST', 'ERCOT']\n",
      "2007_ERCOT_Hourly_Load_Data.csv ['Hour_End', 'COAST', 'EAST', 'FAR_WEST', 'NORTH', 'NORTH_C', 'SOUTHERN', 'SOUTH_C', 'WEST', 'ERCOT']\n",
      "2008_ERCOT_Hourly_Load_Data.csv ['Hour_End', 'COAST', 'EAST', 'FAR_WEST', 'NORTH', 'NORTH_C', 'SOUTHERN', 'SOUTH_C', 'WEST', 'ERCOT']\n",
      "2009_ERCOT_Hourly_Load_Data.csv ['Hour_End', 'COAST', 'EAST', 'FAR_WEST', 'NORTH', 'NORTH_C', 'SOUTHERN', 'SOUTH_C', 'WEST', 'ERCOT']\n",
      "2010_ERCOT_Hourly_Load_Data.csv ['Hour_End', 'COAST', 'EAST', 'FAR_WEST', 'NORTH', 'NORTH_C', 'SOUTHERN', 'SOUTH_C', 'WEST', 'ERCOT']\n",
      "2011_ERCOT_Hourly_Load_Data.csv ['Hour_End', 'COAST', 'EAST', 'FAR_WEST', 'NORTH', 'NORTH_C', 'SOUTHERN', 'SOUTH_C', 'WEST', 'ERCOT']\n",
      "2012_ERCOT_Hourly_Load_Data.csv ['Hour_End', 'COAST', 'EAST', 'FAR_WEST', 'NORTH', 'NORTH_C', 'SOUTHERN', 'SOUTH_C', 'WEST', 'ERCOT']\n",
      "2013_ERCOT_Hourly_Load_Data.csv ['Hour_End', 'COAST', 'EAST', 'FAR_WEST', 'NORTH', 'NORTH_C', 'SOUTHERN', 'SOUTH_C', 'WEST', 'ERCOT']\n",
      "2014_ERCOT_Hourly_Load_Data.csv ['Hour_End', 'COAST', 'EAST', 'FAR_WEST', 'NORTH', 'NORTH_C', 'SOUTHERN', 'SOUTH_C', 'WEST', 'ERCOT']\n",
      "native_Load_2015.csv ['Hour_End', 'COAST', 'EAST', 'FAR_WEST', 'NORTH', 'NORTH_C', 'SOUTHERN', 'SOUTH_C', 'WEST', 'ERCOT']\n",
      "native_Load_2016.csv ['Hour_End', 'COAST', 'EAST', 'FAR_WEST', 'NORTH', 'NORTH_C', 'SOUTHERN', 'SOUTH_C', 'WEST', 'ERCOT']\n",
      "native_Load_2017.csv ['Hour Ending', 'COAST', 'EAST', 'FWEST', 'NORTH', 'NCENT', 'SOUTH', 'SCENT', 'WEST', 'ERCOT']\n",
      "Native_Load_2018.csv ['HourEnding', 'COAST', 'EAST', 'FWEST', 'NORTH', 'NCENT', 'SOUTH', 'SCENT', 'WEST', 'ERCOT']\n",
      "Native_Load_2019.csv ['HourEnding', 'COAST', 'EAST', 'FWEST', 'NORTH', 'NCENT', 'SOUTH', 'SCENT', 'WEST', 'ERCOT']\n",
      "Native_Load_2020.csv ['HourEnding', 'COAST', 'EAST', 'FWEST', 'NORTH', 'NCENT', 'SOUTH', 'SCENT', 'WEST', 'ERCOT']\n",
      "Native_Load_2021.csv ['Hour Ending', 'COAST', 'EAST', 'FWEST', 'NORTH', 'NCENT', 'SOUTH', 'SCENT', 'WEST', 'ERCOT']\n"
     ]
    }
   ],
   "source": [
    "for n,x in df_list.items():\n",
    "    print(n, list(x))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "2369405a-4489-468b-b468-40dc2a750661",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2004_ERCOT_Hourly_Load_Data.csv ['ds', 'COAST', 'EAST', 'FAR_WEST', 'NORTH', 'NORTH_C', 'SOUTHERN', 'SOUTH_C', 'WEST', 'y']\n",
      "2005_ERCOT_Hourly_Load_Data.csv ['ds', 'COAST', 'EAST', 'FAR_WEST', 'NORTH', 'NORTH_C', 'SOUTHERN', 'SOUTH_C', 'WEST', 'y']\n",
      "2006_ERCOT_Hourly_Load_Data.csv ['ds', 'COAST', 'EAST', 'FAR_WEST', 'NORTH', 'NORTH_C', 'SOUTHERN', 'SOUTH_C', 'WEST', 'y']\n",
      "2007_ERCOT_Hourly_Load_Data.csv ['ds', 'COAST', 'EAST', 'FAR_WEST', 'NORTH', 'NORTH_C', 'SOUTHERN', 'SOUTH_C', 'WEST', 'y']\n",
      "2008_ERCOT_Hourly_Load_Data.csv ['ds', 'COAST', 'EAST', 'FAR_WEST', 'NORTH', 'NORTH_C', 'SOUTHERN', 'SOUTH_C', 'WEST', 'y']\n",
      "2009_ERCOT_Hourly_Load_Data.csv ['ds', 'COAST', 'EAST', 'FAR_WEST', 'NORTH', 'NORTH_C', 'SOUTHERN', 'SOUTH_C', 'WEST', 'y']\n",
      "2010_ERCOT_Hourly_Load_Data.csv ['ds', 'COAST', 'EAST', 'FAR_WEST', 'NORTH', 'NORTH_C', 'SOUTHERN', 'SOUTH_C', 'WEST', 'y']\n",
      "2011_ERCOT_Hourly_Load_Data.csv ['ds', 'COAST', 'EAST', 'FAR_WEST', 'NORTH', 'NORTH_C', 'SOUTHERN', 'SOUTH_C', 'WEST', 'y']\n",
      "2012_ERCOT_Hourly_Load_Data.csv ['ds', 'COAST', 'EAST', 'FAR_WEST', 'NORTH', 'NORTH_C', 'SOUTHERN', 'SOUTH_C', 'WEST', 'y']\n",
      "2013_ERCOT_Hourly_Load_Data.csv ['ds', 'COAST', 'EAST', 'FAR_WEST', 'NORTH', 'NORTH_C', 'SOUTHERN', 'SOUTH_C', 'WEST', 'y']\n",
      "2014_ERCOT_Hourly_Load_Data.csv ['ds', 'COAST', 'EAST', 'FAR_WEST', 'NORTH', 'NORTH_C', 'SOUTHERN', 'SOUTH_C', 'WEST', 'y']\n",
      "native_Load_2015.csv ['ds', 'COAST', 'EAST', 'FAR_WEST', 'NORTH', 'NORTH_C', 'SOUTHERN', 'SOUTH_C', 'WEST', 'y']\n",
      "native_Load_2016.csv ['ds', 'COAST', 'EAST', 'FAR_WEST', 'NORTH', 'NORTH_C', 'SOUTHERN', 'SOUTH_C', 'WEST', 'y']\n",
      "native_Load_2017.csv ['ds', 'COAST', 'EAST', 'FAR_WEST', 'NORTH', 'NORTH_C', 'SOUTHERN', 'SOUTH_C', 'WEST', 'y']\n",
      "Native_Load_2018.csv ['ds', 'COAST', 'EAST', 'FAR_WEST', 'NORTH', 'NORTH_C', 'SOUTHERN', 'SOUTH_C', 'WEST', 'y']\n",
      "Native_Load_2019.csv ['ds', 'COAST', 'EAST', 'FAR_WEST', 'NORTH', 'NORTH_C', 'SOUTHERN', 'SOUTH_C', 'WEST', 'y']\n",
      "Native_Load_2020.csv ['ds', 'COAST', 'EAST', 'FAR_WEST', 'NORTH', 'NORTH_C', 'SOUTHERN', 'SOUTH_C', 'WEST', 'y']\n",
      "Native_Load_2021.csv ['ds', 'COAST', 'EAST', 'FAR_WEST', 'NORTH', 'NORTH_C', 'SOUTHERN', 'SOUTH_C', 'WEST', 'y']\n"
     ]
    }
   ],
   "source": [
    "for n,x in df_list.items():\n",
    "    df_list[n].columns = col_names\n",
    "    \n",
    "for n,x in df_list.items():\n",
    "    print(n, list(x))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "141df170-3d72-4bd5-8cf3-0d109ebbb4a3",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2004_ERCOT_Hourly_Load_Data.csv\n",
      "2005_ERCOT_Hourly_Load_Data.csv\n",
      "2006_ERCOT_Hourly_Load_Data.csv\n",
      "2007_ERCOT_Hourly_Load_Data.csv\n",
      "2008_ERCOT_Hourly_Load_Data.csv\n",
      "2009_ERCOT_Hourly_Load_Data.csv\n",
      "2010_ERCOT_Hourly_Load_Data.csv\n",
      "2011_ERCOT_Hourly_Load_Data.csv\n",
      "2012_ERCOT_Hourly_Load_Data.csv\n",
      "2013_ERCOT_Hourly_Load_Data.csv\n",
      "2014_ERCOT_Hourly_Load_Data.csv\n",
      "native_Load_2015.csv\n",
      "native_Load_2016.csv\n",
      "native_Load_2017.csv\n",
      "Native_Load_2018.csv\n",
      "Native_Load_2019.csv\n",
      "Native_Load_2020.csv\n",
      "Native_Load_2021.csv\n"
     ]
    }
   ],
   "source": [
    "# optionally drop only NaN\n",
    "# df_list[\"native_Load_2016.csv\"].drop(7463, inplace=True)\n",
    "\n",
    "for n,x in df_list.items():\n",
    "    print(n)\n",
    "    for name in list(x):\n",
    "        if name != 'ds':\n",
    "            df_list[n][name] = pd.to_numeric(x[name].astype(str).str.replace(',', ''), errors='coerce')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "3974e9a4-92de-436b-8c81-109272f76959",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1/1/2004 1:59\n",
      "2004-01-01 02:00:00\n",
      "01/01/2017 24:00\n",
      "2017-01-02 00:00:00\n"
     ]
    }
   ],
   "source": [
    "def my_to_datetime(date_str):\n",
    "    if date_str[11:13] != '24':\n",
    "        out = pd.to_datetime(date_str)\n",
    "    else:\n",
    "        date_str = date_str[0:11] + '00' + date_str[13:]\n",
    "        out = pd.to_datetime(date_str) + dt.timedelta(days=1)\n",
    "    if out.minute != 00:\n",
    "        out = out.replace(minute=0)\n",
    "        out = out + dt.timedelta(hours=1)\n",
    "    return out\n",
    "    \n",
    "x1 = df_list['2004_ERCOT_Hourly_Load_Data.csv']['ds'][1]\n",
    "print(x1)\n",
    "print(my_to_datetime(x1))    \n",
    "x2 = df_list['native_Load_2017.csv']['ds'][23]\n",
    "print(x2)\n",
    "print(my_to_datetime(x2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "2dfc8389-42cb-4f4c-8fb3-2984b6755761",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2004_ERCOT_Hourly_Load_Data.csv\n",
      "2005_ERCOT_Hourly_Load_Data.csv\n",
      "2006_ERCOT_Hourly_Load_Data.csv\n",
      "2007_ERCOT_Hourly_Load_Data.csv\n",
      "2008_ERCOT_Hourly_Load_Data.csv\n",
      "2009_ERCOT_Hourly_Load_Data.csv\n",
      "2010_ERCOT_Hourly_Load_Data.csv\n",
      "2011_ERCOT_Hourly_Load_Data.csv\n",
      "2012_ERCOT_Hourly_Load_Data.csv\n",
      "2013_ERCOT_Hourly_Load_Data.csv\n",
      "2014_ERCOT_Hourly_Load_Data.csv\n",
      "native_Load_2015.csv\n",
      "native_Load_2016.csv\n",
      "native_Load_2017.csv\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/tabletop/venv/np-dev/lib/python3.9/site-packages/dateutil/parser/_parser.py:1207: UnknownTimezoneWarning: tzname DST identified but not understood.  Pass `tzinfos` argument in order to correctly return a timezone-aware datetime.  In a future version, this will raise an exception.\n",
      "  warnings.warn(\"tzname {tzname} identified but not understood.  \"\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Native_Load_2018.csv\n",
      "Native_Load_2019.csv\n",
      "Native_Load_2020.csv\n",
      "Native_Load_2021.csv\n"
     ]
    }
   ],
   "source": [
    "for n,x in df_list.items():\n",
    "    print(n)\n",
    "    df_list[n]['ds'] = x['ds'].apply(my_to_datetime)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "408ee664-270f-406d-b397-a83b01180ff5",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = df_list['2004_ERCOT_Hourly_Load_Data.csv']\n",
    "del df_list['2004_ERCOT_Hourly_Load_Data.csv']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "428cf9c8-cb7f-4406-b055-fc1a49b57d62",
   "metadata": {},
   "outputs": [],
   "source": [
    "for n,x in df_list.items():\n",
    "    df = df.append(x)\n",
    "df = df.reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "83240bd8-8767-4fd1-bac8-3dc04bb99db3",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "17\n"
     ]
    }
   ],
   "source": [
    "before = len(df)\n",
    "to_remove = df[\"ds\"].duplicated(keep=\"first\")\n",
    "df = df[~to_remove]\n",
    "after = len(df)\n",
    "print(before - after)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "f9b085e3-d659-4ca9-a976-75d558bf364a",
   "metadata": {},
   "outputs": [],
   "source": [
    "df.to_csv(data_dir + \"ERCOT_load_2004_2021Sept.csv\", index=False, float_format='%.2f')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "921343a3-b89c-4374-bf97-53e516b47c13",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "np-dev",
   "language": "python",
   "name": "np-dev"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
